import glob
import os
from collections import Counter
from itertools import islice

from mate_data_loader import is_chinese

# Count how often each character accepted by `is_chinese` occurs in the
# *.txt files under `datadir`, then write the most frequent characters
# (one "<char> <count>" pair per line) to OUTPUT_PATH.

# Root directory that is searched recursively for *.txt input files.
datadir = 'D:/workspace/res_input_output/raw_input_bak'

# The original loop only broke after finishing the file at index 1000, so
# 1001 files were counted; the cap keeps that value so the output does not
# change.  NOTE(review): lower to 1000 if that was the intended limit.
MAX_FILES = 1001

# Number of most frequent characters written to the output file.
TOP_N = 3000

OUTPUT_PATH = './output/char_freq.txt'

# Take the first MAX_FILES matches lazily instead of materialising the whole
# recursive glob when only a prefix of it is ever read.
file_iter = islice(glob.iglob(datadir + '/**/*.txt', recursive=True), MAX_FILES)

term_freq = Counter()
for f in file_iter:
    # `with` closes each input file promptly; iterating a bare open() call
    # left every handle open until garbage collection.
    with open(f, 'r', encoding='utf-8') as fr:
        for line in fr:
            # Iterating a str yields its individual characters.
            term_freq.update(t for t in line if is_chinese(t))

# (char, count) pairs sorted by descending count; characters with equal
# counts keep the order in which they were first encountered.
kv = term_freq.most_common()

# Create the output directory up front so the run does not fail at the very
# end just because './output' is missing.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
with open(OUTPUT_PATH, 'w', encoding='utf-8') as fw:
    for k, v in kv[:TOP_N]:
        print(f'{k} {v}', file=fw)
